#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#       Replication file (double selection):                               #
#       D�r & M�dlhamer (2021)                                             #
#       Power and Innovative Capacity: Explaining Variation in             #
#       Intellectual Property Rights Regulation across Trade Agreements    #
#       International Interactions                                         #
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#


rm(list=ls())
setwd("ADD PERSONAL DIRECTORY") 
sink("replication_output_doubleselection.txt", append=FALSE, split=TRUE, type = c("output", "message"))


# load packages -----------------------------------------------------------


library(pacman)
p_load("tidyverse", "stringi", "countrycode", "prediction", "sampleSelection", 
       "stargazer", "reshape2")


# load data -----------------------------------------------------------


d1 <- read_csv("replication_data_crosssection_duer_moedlhamer.csv", col_types = cols(number = col_character()), guess_max=19000)


# create some variables --------------------------------------------------


d1$gdp_fill_diff_max_ln <- log(d1$gdp_fill_diff_max)
d1$patents_fill_diff_max_nona_ln <- log(d1$patents_fill_diff_max_nona+1)
d1$gdppc_fill_diff_max_ln <- log(d1$gdppc_fill_diff_max)
d1$ptalength_ln <- log(d1$ptalength)
d1$year_count <- d1$year - min(d1$year, na.rm=TRUE)


# descriptive statistics (Table A3 in the Appendix) --------------------------------------------------


stargazer(as.data.frame(d1[,c("pta_exist", "ipr_included", "lsa1", "patents_fill_diff_max_nona_ln", 
                  "gdp_fill_diff_max_ln", "emerging", "gdppc_fill_diff_max_ln", 
                "polity_fill_mean","wto_memb", "ptalength_ln", 
                "contig", "distance_km", "comlang_off", "colony", "comrelig",
                  "tbt_prov")]),
          covariate.labels = c("PTA", "IPR included", "IPR depth",  
                               "Patents (diff., log.)", 
                               "GDP (diff., log.)", "BRIC countries", "GDPpc (diff., log.)", 
                               "Democracy (mean)", "WTO member", "Word count (PTA, log.)", 
                               "Contiguity", "Distance (in km)", "Common language", "Colony", "Common religion",
                               "TBT provision"),
          digits=2, median=TRUE, type = "latex", label="tab_desc_double", font.size = "footnotesize", 
          out="tableA3_descriptivestatistics_double_selection.tex",
          title = "Descriptive statistics (Double selection)", omit.summary.stat=c("p25", "p75")) 


# run models ---------------------------------------------------------------


data_na <- d1[!is.na(d1$patents_fill_diff_max_nona),]
data_na <- data_na[!is.na(data_na$polity_fill_mean),]
data_na <- data_na[!is.na(data_na$gdppc_fill_diff_max),]
data_na <- data_na[!is.na(data_na$comrelig),]

## double selection ---------------------------------------

summary(mp <- probit(pta_exist ~ 
                       log(gdp_fill_diff_max) * log(patents_fill_diff_max_nona+1) +
                       emerging +
                       log(gdppc_fill_diff_max) + 
                       polity_fill_mean + 
                       wto_memb + 
                       contig +
                       distance_km +
                       comlang_off +
                       colony +
                       comrelig +
                       NULL,
                     data=data_na))

data_na$IMR <- invMillsRatio(mp)$IMR1

mh1 <- heckit(ipr_included ~ 
                log(gdp_fill_diff_max) * log(patents_fill_diff_max_nona+1) +
                emerging + 
                log(gdppc_fill_diff_max) +
                polity_fill_mean +
                wto_memb +
                log(ptalength) +
                year_count +
                tbt_prov +
                IMR + 
                NULL,
              lsa1 ~
                log(gdp_fill_diff_max) * log(patents_fill_diff_max_nona+1) +
                emerging + 
                log(gdppc_fill_diff_max) + 
                polity_fill_mean + 
                wto_memb + 
                log(ptalength) + 
                year_count +
                NULL, 
              data=data_na, method="2step")
summary(mh1)

## only one selection, but with PTA instead of IPR as selection ---------------------------------------

data_na$lsa2 <- ifelse(is.na(data_na$lsa) & data_na$pta_exist==1, 0, data_na$lsa)
data_na$lsa3 <- data_na$lsa2/max(data_na$lsa2, na.rm=T) * 100
mh2 <- heckit(pta_exist ~ 
                log(gdp_fill_diff_max) * log(patents_fill_diff_max_nona+1) +
                emerging + 
                log(gdppc_fill_diff_max) +
                polity_fill_mean +
                wto_memb +
                contig +
                distance_km +
                comlang_off +
                colony +
                comrelig +
                NULL,
              lsa3 ~
                log(gdp_fill_diff_max) * log(patents_fill_diff_max_nona+1) +
                emerging + 
                log(gdppc_fill_diff_max) + 
                polity_fill_mean + 
                wto_memb + 
                log(ptalength) + 
                year_count +
                NULL, 
              data=data_na, method="2step")
summary(mh2)


# exporting models (Table A4 in the Appendix) ---------------------------------------------------------------


#this is a trick to export both parts of model via stargazer
mh1x <- mh1
mh1x$param$index$betaO <- mh1$param$index$betaS
mh1x$param$index$betaS <- mh1$param$index$betaO

mh2x <- mh2
mh2x$param$index$betaO <- mh2$param$index$betaS
mh2x$param$index$betaS <- mh2$param$index$betaO

stargazer(mp, mh1x, mh1, mh2x, mh2,          
          column.labels = c("Selection eq. 1", "Selection eq. 2", "Outcome eq. (Overall)", "Selection eq.", "Outcome eq. (Overall)"), 
          digits=2, 
          type = "latex", 
          header = FALSE,
          order=c(1:2, 16, 3:15),
          covariate.labels=c("GDP (diff., log.)",
                             "Patents (diff., log.)",
                             "Patents (diff., log.) x GDP (diff., log.)", 
                             "BRIC countries",
                             "GDPpc (diff., log.)",
                             "Democracy (mean)",
                             "WTO member",
                             "Contiguity",
                             "Distance (in km)",
                             "Common language",
                             "Colony",
                             "Common religion",
                             "Word count (PTA, log.)",
                             "Year (count)",
                             "TBT provision",
                             "IMR (sel. 1)"),
          dep.var.labels.include = FALSE,
          title = "Double selection model",
          label = "tab_double_selection",
          single.row = FALSE,
          font.size = "footnotesize",
          dep.var.caption = "",
          omit.stat = c("f","ser", "chi2"),
          no.space = TRUE,
          out="tableA4_double_selection.tex")


# Session info ------------------------------------------------------------


sessionInfo()
sink()
#writeLines(capture.output(sessionInfo()), "sessionInfo_doubleselection.txt")